; Entry point of a flat binary loaded at offset 100h.
; Nothing below initializes ax or bx; the code relies on ax=bx=0 at entry.
; The first pass of the `loop` below also needs ch=0, since only cl is set.
org 100h   ; assume ax=bx=0

  pop di   ; di=sp=0

;Prepare floating-point constants for SSE
;[0xfff0]=0xffe00000, [0xffe0]=0xffc00000, ... [0x8000]=0
; Each outer pass lowers ax by 0x20 and pushes the pair (ax, bx=0) four
; times, producing one 16-byte row with the dword ax<<16 in all four lanes.
; ax walks 0xffe0, 0xffc0, ... down to 0, so the table holds every 32-bit
; pattern whose upper half is a multiple of 0x20 and whose lower half is 0.
PK:
  mov cl,4
  sub ax,0x20
PKL:
  push ax  ; x
  push bx  ; 0
  loop PKL ; store four times
  jnz PK   ; loop 2048 times -> 32kB; sp=0x8000
           ; push and loop leave the flags untouched, so this jnz still
           ; tests the result of `sub ax,0x20`; the loop ends with ax=0.

; K(x) addresses the 16-byte row of the startup table that holds the 32-bit
; pattern x<<16 replicated four times (table base 0x8000, one row per 0x20
; step of x). x is the upper 16 bits of an IEEE-754 single and must be a
; multiple of 0x20 for the row to exist.
%define K(x) [0x8000 + 0x10*(x/0x20)]
%define K_0_25        K(0x3e80)  ; 0.25
%define K_EPS         K(0x3c20)  ; 0.009765625
%define K_TIME_DELTA  K(0x3c00)  ; 0.0078125
%define K_COLOR_SCALE K(0x4540)  ; 3072 = 30/EPS

%define K_MAX_STEP    K(0x3f80)  ; 1
%define K_NEG_ABS     K(0x8000)  ; -0 = 0x80000000 for -abs()
%define K_MINUS1      K(0xbf80)  ; -1

;For 16:9 screens; pixel aspect ratio = 1.008
; Scales that turn the 32-bit integer pixel coordinates into view-space floats.
%define K_X_SCALE     K(0x2fe0)  ; 1.75 * 2**-32: x -> ..0.875
%define K_Y_SCALE     K(0x2fa0)  ; 1.25 * 2**-32: y -> ..0.488

;For 4:3 screens; pixel aspect ratio = 0.96
; Alternative scales, disabled; swap with the pair above to use them.
;%define K_X_SCALE     K(0x2fc0)  ; 1.5 * 2**-32: x -> ..0.75
;%define K_Y_SCALE     K(0x2fc0)  ; 1.5 * 2**-32: y -> ..0.586

  ;Video and FPU setup.
  ;Only al is loaded: ah is still 0 because the table loop above exits with
  ;ax=0, so this is BIOS int 10h function 0 (set video mode) with mode 13h.
  mov al,13h
  int 10h
  push 0xa000      ; es = 0xa000, the segment the pixel store writes through
  pop es
  fninit           ; reset the x87 before the time/rotation math

;Palette: diffuse + specular = L * clamp([0.25,H,1] + L^8/2)
; Writes 256 RGB triples through ports 3c8h (start index, set to 0) and 3c9h
; (data). bx is the entry counter: the `or` forces the unused low bits of bh
; and bl to 1, so a single `inc bx` carries through them and steps H (top 3
; bits of bl) and then L (top 5 bits of bh). The loop ends when bx wraps to 0.
; NOTE(review): the base colours actually pushed below are r=50, g=bl, b=di,
; which does not match the [0.25,H,1] in the formula above - confirm.
  mov dx,3c8h
  xor ax,ax
  out dx,al
  inc dx
P:
  or bx,0b0000011100011111  ; bx = LLLLL... HHH.....

; The three base components are pushed in reverse so MAD pops them r, g, b.
  push di    ; b: NOTE(review): di is 0 here (set by `pop di` at entry and not
             ; changed since), so the blue base looks like 0, not 1 - confirm
  push bx    ; g=H
  push 50    ; r=0.2

; Square bh three times, keeping the high byte of each product: al = L^8.
; The iteration count comes from the parity flag of si's low byte after
; `inc si`, not from a counter register. Three passes requires that low byte
; to start at 0 (presumably si=0x100 at DOS entry - TODO confirm).
  mov al,bh
POW:
  mul al
  mov al,ah
  inc si
  jpo POW    ; 3 times
  xchg ax,cx
  shr cl,1   ; cl=L^8/2 (0..127)

; For each popped component: add the specular term, saturate to 255, scale by
; the luminance in bh and keep the top 6 bits of the 16-bit product (0..63).
MAD:
  pop ax     ; rgb
  add al,cl  ; al=L^8/2 + rgb
  jnc CLAMP
  salc       ; clamp to 255
CLAMP:
  mul bh     ; ah=L*clamp(L^8/2 + rgb)
  shr ax,10
  out dx,al
  dec si     ; undoes the three `inc si` above, same parity trick
  jpo MAD    ; 3 times

  inc bx
  jnz P

  ; Time accumulator t lives on the x87 stack for the whole run.
  fldz             ;| t

;For each frame: compute rotation matrices and constants
; Two rotation angles are derived from t: a slow one (t itself) and a fast
; one (t * log2(e), via fldl2e). Stack comments list st0 first.
M fadd dword K_TIME_DELTA ;| t+=dt
  fld st0
  fsincos          ;| slowC slowS t
  fldl2e           ;| 1.442695 slowC slowS t
  fmul st3         ;| 1.442695*t slowC slowS t
  fsincos          ;| fastC fastS slowC slowS t

;Store each constant four times (for SSE)
; Resulting layout (16 bytes each): 0x6fc0 fastC, 0x6fd0 fastS,
; 0x6fe0 slowC, 0x6ff0 slowS. Only bl is advanced, so the outer loop ends
; when bl wraps from 0xfc to 0 after the 16th store; bx is then 0x6f00 and
; is reloaded at X. `loop` and `fstp` do not touch ZF, so `jnz STORE` still
; sees the result of `add bl,4`.
  mov bx,0x7000 - 0x40
STORE:
  mov cx,4
STORE4:
  fst dword[bx]    ;bx-0x40  0x30  0x20  0x10  |0
  add bl,4         ;   fastC fastS slowC slowS |XY
  loop STORE4

  fstp st0         ; pop the value just stored; after 4 pops only t remains
  jnz STORE        ; loop 4 times

; With bx=0x7000 these select the fast pair; with bx=0x7020 the slow pair.
%define COS  [bx-0x40]
%define SIN  [bx-0x30]

;For each pixel: store x,y coordinates
; Four consecutive pixels are processed together (one per SSE lane). For each
; one, the 32-bit value di*0xcccd + 0x9b800000 is stored at 0x7000 + 4*lane.
; di is the pixel offset and is advanced by 4 here, before the pixels are
; drawn. `mul di` overwrites dx, which is no longer needed as the palette port.
X mov bx,0x7000
  push bx
  mov cx,4
X4:
  mov ax,0xcccd
  mul di
  add dx,0x9b80
  mov [bx],ax
  mov [bx+2],dx
  add bl,4
  inc di
  loop X4      ; di+=4; bx=0x7010 and cx=0 here

  pop bx       ; bx back to 0x7000, the base for INT_X/INT_Y/COS/SIN

; INT_Y reads the four stored dwords directly. INT_X reads the same 16 bytes
; starting one byte lower, so each lane sees its dword shifted left by 8 bits
; with one unrelated byte in the low position.
%define INT_X [bx-1]  ; x = 2^32 * (-0.5..0.5)
%define INT_Y [bx]    ; y = 0xcccd * 320 * (-100..100) = 2^32 * (-0.3906..0.3906)

; Register roles shared by the tracing code and MAP.
%define x xmm0 ; inputs (destroyed)
%define y xmm1
%define z xmm2
%define o xmm3 ; output: orbit trap
%define a xmm4 ; scratch, output: estimated distance
%define b xmm5 ; scratch
%define R xmm6 ; translation radius
%define Z xmm7 ; depth
;Trace a ray for 30 steps
; ch is 0 here (cx was counted down to 0 by X4) and MAP leaves ch at 0 on
; return, so setting only cl gives cx=30 and `loop T` runs exactly 30 times.
  mov cl,30
  movaps Z,K_MINUS1 ; Z=-1
T call MAP          ;| map(X,Y,Z) Z
  addps Z,a         ; Z+=map(X,Y,Z)
  loop T

;Normal, ambient occlusion
; Shading is the difference of the distance estimate at Z-EPS and at Z.
; [bx+0x10] (0x7010) is used as a 16-byte scratch slot for the first sample.
  call MAP
  movaps [bx+0x10],a
  subps Z,K_EPS
  call MAP         ; a = map(X,Y,Z-EPS)
  subps a,[bx+0x10]; a = map(X,Y,Z-EPS) - map(X,Y,Z)

;Fog
; (disabled)
;  minps Z,K1       ; Z = min(.9-Z,1)
;  mulps a,Z        ; a *= Z

;Store pixel
; Scale, convert to integers, then saturate dword -> word -> unsigned byte so
; each lane becomes one palette index; the low 4 bytes are the 4 pixels.
  mulps a,K_COLOR_SCALE
  cvtps2dq a,a
  packssdw a,a
  packuswb a,a     ; clamp to a byte 0..255
  movd [es:di-4],a ; di was already advanced by 4 in X4, hence the -4

; (disabled alternative output path)
;  movd eax,a
;  movaps a,o
;  xchg eax,ebp
;  xor bx,32
;  jpo STORE
;  lea eax,[ebp*8+eax]
;  stosd

; A frame is complete when the 16-bit pixel offset wraps back to 0.
  test di,di
  jnz X

; Read a byte from port 0x60 and start the next frame unless it equals 1.
; When it is 1, execution deliberately falls into MAP: its final `ret` pops
; the word at the stack top (sp=0x8000, where the constant table holds 0) and
; jumps to offset 0 - presumably the exit stub at the start of the DOS
; program segment; confirm for the target environment.
  in al,0x60
  dec al
  jnz M   ; fallthrough

; MAP: distance estimate for four rays at once (one per SSE lane).
; In:    bx = 0x7000 (base for INT_X, INT_Y, COS, SIN), Z = current depth
; Out:   a = min(estimated distance, MAX_STEP), o = orbit trap
; Clobb: x, y, z, b, R, flags; ch is used as the counter and returns as 0;
;        bl is toggled an even number of times, so bx is unchanged on return.
MAP:
  movups x,INT_X        ; unaligned load: INT_X sits at bx-1
  cvtdq2ps y,INT_Y
  cvtdq2ps x,x
  mulps x,K_X_SCALE ; scale x to view space (about +-0.875 with the 16:9 constant)
  mulps y,K_Y_SCALE
  movaps z,Z    ; x,y,z = X,Y,Z

  xorps o,o    ; o=0
  movaps R,K_0_25 ; R=K: translation = [R,R/4,0]
  mov ch,19    ; do 19 iterations (ch, so the caller's count in cl survives)

;Rotate in the xz and yx planes
; The body runs twice per iteration: first with bl=0x00 (fast sin/cos), then
; with bl=0x20 (slow sin/cos). `xor bl,0x20` both switches the constant pair
; and, through the parity of the new bl, decides whether to run a second pass.
L movaps b,SIN ; b=fastS a=fastC | b=slowS a=slowC
  movaps a,COS
  mulps b,z    ; b=Sz
  mulps z,a    ; z=Cz
  mulps a,x    ; a=Cx
  mulps x,SIN  ; x=Sx
  addps a,b    ; a=x'=Cx+Sz
  subps z,x    ; z=z'=Cz-Sx

  movaps x,y   ; cycle x,y,z <- y,z,a
  movaps y,z
  movaps z,a
  xor bl,0x20  ; 0x00 | 0x20
  jpo L        ; taken when bl became 0x20 (odd parity), not when back at 0x00

;Reflect along x and y
; OR-ing with -0 sets the sign bit of every lane.
  orps x,K_NEG_ABS  ; x=-abs(x)
  orps y,K_NEG_ABS  ; y=-abs(y)

;Translate
  movaps a,R
  mulps a,K_0_25 ; a=K*R
  addps x,R    ; x+=R
  addps y,a    ; y+=K*R
  subps R,a    ; R*=1-K: scale translation vector

;Squared distance to [0,0,0]
  movaps a,x
  movaps b,y
  mulps a,a    ; a=x*x
  mulps b,b    ; b=y*y
  addps b,a    ; b=x*x+y*y
  movaps a,z
  mulps a,a    ; a=z*z
  addps b,a    ; b=length^2=x*x+y*y+z*z

;Orbit trap
  maxps o,b    ; o = max(o,length^2)

;Iterate 19 times
  dec ch
  jnz L

;Distance to a little sphere
; length is obtained from the approximate reciprocal square root.
  rsqrtps a,b  ; a=(length^2)^(-1/2)
  mulps a,b    ; a=(length^2)^(-1/2 + 1) = length

  subps a,R
  subps a,R    ; a=length-2R: offset by 2*radius
  minps a,K_MAX_STEP ; a=min(length-2R, MAX_STEP)
  ret
